Dynamic Plotting of Fashion Brand Patterns on Instagram

My goal is to use interactive visualizations to find patterns in Instagram posts related to fashion brands.

The data were obtained from the project https://arxiv.org/abs/1704.04137. For that project, the authors collected 24,752 Instagram posts by 13,350 people. The data collection was done over a one-month period in January 2015, and every post names a renowned fashion brand in its hashtags.

The data includes:

  • Basic information of the posts (brand name, brand category, likes, comments, user id, followings, followers, captions, hashtags, and creation time).
  • Learned features from their model (identifies the kind of picture).
  • Learned features from the Microsoft emotion API (identifies the emotion of the people who appear in the picture).
In [1]:
import pandas as pd
from matplotlib import pyplot as plt
%matplotlib inline
import altair as alt
from altair import *
from ipywidgets import widgets, interact, interactive, fixed, interact_manual
from IPython.display import display
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
#import re
from string import digits 
import spacy
import en_core_web_sm
import nltk
from nltk.corpus import stopwords


print('Loading words, spacy, punktd, stopwords')
nltk.download('words')
nlp = en_core_web_sm.load()
words = set(nltk.corpus.words.words())
nltk.download('stopwords')
nltk.download('punkt')
print('done, now loading text and basic formating of columns name')


def _normalize_labels(values):
    """Return *values* stripped, lower-cased, and with separators unified.

    Accepts anything exposing the pandas ``.str`` accessor (an ``Index`` or a
    ``Series``). Spaces and hyphens become underscores and question marks are
    removed. Every pattern is passed with ``regex=False`` so it is always
    treated literally, whatever the installed pandas uses as its default.
    """
    return (
        values.str.strip()
        .str.lower()
        .str.replace(' ', '_', regex=False)
        .str.replace('-', '_', regex=False)
        .str.replace('?', '', regex=False)
    )


# Read the dataset, then apply the same normalisation to the column names
# and to the two label columns used for grouping later on.
df = pd.read_csv('fashion data on instagram.csv', index_col=0)

df.columns = _normalize_labels(df.columns)
df.brandname = _normalize_labels(df.brandname)
df.brandcategory = _normalize_labels(df.brandcategory)
print(df.columns)  #print(df.shape)
print('ready!!')
Loading words, spacy, punktd, stopwords
[nltk_data] Downloading package words to
[nltk_data]     C:\Users\thalia\AppData\Roaming\nltk_data...
[nltk_data]   Package words is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\thalia\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\thalia\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
done, now loading text and basic formating of columns name
Index(['followings', 'followers', 'mediacount', 'brandname', 'brandcategory',
       'hashtags', 'caption', 'imgurl', 'likes', 'comments', 'creationtime',
       'link', 'selfie', 'bodysnap', 'marketing', 'productonly', 'nonfashion',
       'face', 'logo', 'brandlogo', 'smile', 'outdoor', 'numberofpeople',
       'numberoffashionproduct', 'anger', 'contempt', 'disgust', 'fear',
       'happiness', 'neutral', 'sadness', 'surprise'],
      dtype='object')
ready!!
In [2]:
# Lift Altair's row limit so the whole filtered dataset can be charted.
alt.data_transformers.disable_max_rows()

for hint in (
    'Altair interactive plot',
    'click in the brandcategory to highlight the point in the category',
    'mouse over a data point to see the brand name',
    'scroll over a region to zoom in.',
):
    print(hint)

# Selection keyed on the brand category: selected points keep their category
# colour, everything else is drawn in light gray.
selector = alt.selection_single(empty='all', fields=['brandcategory'])
colours_obj = alt.Color('brandcategory:N')
colours_condition = alt.condition(selector, colours_obj, alt.value("lightgray"))

# Keep only the posts that fall inside the axis ranges used below.
in_range = (df['followers'] <= 500000) & (df['likes'] <= 20000)

scatter = alt.Chart(df[in_range]).mark_point().encode(
    alt.X('followers:Q', scale=alt.Scale(domain=(-5, 500000))),
    alt.Y('likes:Q', scale=alt.Scale(domain=(-5, 20000))),
    size=alt.Size('comments:Q', scale=alt.Scale(range=(20, 500))),
    color=colours_condition,
    tooltip=['brandname:N', 'followers', 'likes', 'comments'],
)
scatter.interactive().add_selection(selector)
Altair interactive plot
click in the brandcategory to highlight the point in the category
mouse over a data point to see the brand name
scroll over a region to zoom in.
Out[2]:
In [3]:
### these are my plotting functions that I'm going to call with interact

####function for pie plot for categories
def pies_cat(frame, mytitle, labels=None):
    """Draw a pie chart of *frame* with percentage annotations.

    Args:
        frame: Object exposing ``.index`` and ``.plot.pie`` (in this notebook
            a pandas Series holding one value per brand category).
        mytitle: Chart title; also echoed in the progress message.
        labels: Optional wedge labels. When omitted, ``frame.index`` is used
            so the labels always correspond to the plotted values instead of
            depending on a module-level list defined elsewhere.

    Returns:
        The result of ``set_ylabel('')`` on the object returned by
        ``frame.plot.pie``.
    """
    print('Preparing pie plot for', mytitle)
    if labels is None:
        labels = frame.index
    axes = frame.plot.pie(labels=labels, shadow=False, autopct='%1.1f%%',
                          textprops={'fontsize': 14}, title=mytitle)
    # Clear the y-axis label so only the title and wedge labels remain.
    return axes.set_ylabel('')

#####function for dynamic pie plot, it changes brand category and display all the brands in the category
def pies_brand(Category, mytitle, frame, col):
    """Draw a pie chart of column *col* for the brands of one category.

    Args:
        Category: Value of ``frame['brandcategory']`` to keep.
        mytitle: Chart title.
        frame: Table with ``'brandcategory'``, ``'brandname'`` and *col*
            columns, one row per brand.
        col: Name of the column whose values size the wedges.

    Returns:
        The result of ``set_ylabel('')`` on the object returned by the
        ``plot.pie`` call.
    """
    print('Preparing pie plot, you can change the Category =)' )
    # Filter once and take values AND labels from the same rows. Labelling
    # with the unfiltered brand names pairs the wedges with the first brands
    # of the whole table rather than with the brands of this category.
    subset = frame[frame['brandcategory'] == Category]
    axes = subset[col].plot.pie(labels=subset['brandname'], shadow=False,
                                textprops={'fontsize': 14}, title=mytitle)
    return axes.set_ylabel('')

We verified that we have data from 24,752 posts. The first thing I want to explore is the percentage of posts that corresponds to each brand and brand category.

In [4]:
list1 = ['designer', 'high_street', 'mega_couture', 'small_couture']

# Per-category tally: count() gives the number of non-null entries per column
# within each category.
posts_cat = df.groupby(['brandcategory']).count()
# Same tally per (category, brand), with the group keys turned back into
# ordinary columns.
posts_brand = df.groupby(['brandcategory', 'brandname']).count().reset_index()

pies_cat(posts_cat['followers'], 'Posts per Category')

# Only the columns the per-brand pie needs.
brand_counts = posts_brand[['brandcategory', 'brandname', 'followers']]
interact(
    pies_brand,
    Category=list1,
    mytitle=fixed('Post Per Brand'),
    frame=fixed(brand_counts),
    col=fixed('followers'),
)
Preparing pie plot for Posts per Category
Out[4]:
<function __main__.pies_brand(Category, mytitle, frame, col)>
In [5]:
# Total likes and comments per category. numeric_only=True restricts the sum
# to numeric columns explicitly; without it, newer pandas releases also try to
# "sum" the text columns (captions, hashtags, links, ...).
sum_cat = df.groupby(['brandcategory']).sum(numeric_only=True)
# Same totals per (category, brand), with the group keys turned back into
# ordinary columns so the pie function can filter on them.
sum_brand = df.groupby(['brandcategory', 'brandname']).sum(numeric_only=True)
sum_brand.reset_index(inplace=True)

pies_cat(sum_cat['likes'], 'Likes per Category')
interact(pies_brand, Category=list1, mytitle=fixed('Likes Per Brand'),
         frame=fixed(sum_brand[['brandcategory', 'brandname', 'likes']]),
         col=fixed('likes'))

pies_cat(sum_cat['comments'], 'Comments per Category')
interact(pies_brand, Category=list1, mytitle=fixed('Comments Per Brand'),
         frame=fixed(sum_brand[['brandcategory', 'brandname', 'comments']]),
         col=fixed('comments'))
Preparing pie plot for Likes per Category
Preparing pie plot for Comments per Category
Out[5]:
<function __main__.pies_brand(Category, mytitle, frame, col)>
In [6]:
### I'm going to clean just the text needed for the clouds;
#### these are the individual cleaning functions that I call in the full cleaning function

def remove_stopwords(text):
    """Return *text* with English stop words removed.

    The text is split with ``nltk.word_tokenize``, each token is stripped of
    surrounding whitespace, and tokens present in NLTK's English stop-word
    list are dropped. Tokens are compared as-is, with no case folding.

    Args:
        text: String to filter.

    Returns:
        The remaining tokens joined by single spaces.
    """
    # A set gives constant-time membership tests; scanning the list for every
    # token is needlessly slow when the text is a whole category of posts.
    stopword_set = set(stopwords.words('english'))
    stripped = (token.strip() for token in nltk.word_tokenize(text))
    return ' '.join(token for token in stripped if token not in stopword_set)

def lemmatize(text):
    """Return *text* with every token replaced by its lemma.

    The text is run through the module-level ``nlp`` pipeline. A token whose
    lemma is the placeholder ``'-PRON-'`` keeps its original text.
    """
    pieces = []
    for token in nlp(text):
        lemma = token.lemma_
        pieces.append(token.text if lemma == '-PRON-' else lemma)
    return ' '.join(pieces)

def clean_sent(sent):
    """Return *sent* without alphabetic tokens missing from ``words``.

    The sentence is split with ``nltk.wordpunct_tokenize``. Purely alphabetic
    tokens are kept only when their lower-cased form is in the module-level
    ``words`` set; every non-alphabetic token is kept unchanged.
    """
    kept = []
    for piece in nltk.wordpunct_tokenize(sent):
        if not piece.isalpha() or piece.lower() in words:
            kept.append(piece)
    return " ".join(kept)

###       function to do all the cleaning    ################################
def full_cleaning(Category, frame, col):
    """Build one cleaned text blob from column *col* for a brand category.

    Steps: keep the rows whose ``'brandcategory'`` equals *Category*; remove
    digit runs and question marks; turn every remaining non-word character
    into a space; lower-case; join all rows with spaces; then pass the result
    through ``remove_stopwords`` and ``clean_sent``.

    Args:
        Category: Brand category whose rows are used.
        frame: DataFrame with a ``'brandcategory'`` column and column *col*.
            It is not modified.
        col: Name of the text column to clean.

    Returns:
        The cleaned text as a single string.
    """
    print('cleaning....')
    # Filter first so the string operations only touch the rows that are used.
    in_category = frame['brandcategory'] == Category
    cleaned = (
        frame.loc[in_category, col]
        # Raw patterns with an explicit regex flag: without regex=True newer
        # pandas treats '\d+' and '\W' as literal text and replaces nothing.
        .str.replace(r'\d+', '', regex=True)
        .str.replace('?', '', regex=False)
        .str.replace(r'\W', ' ', regex=True)
        .str.lower()
        # Drop missing entries instead of letting them reach the text as the
        # literal word "nan".
        .dropna()
    )
    text = ' '.join(cleaned)
    cleantext = remove_stopwords(text)
    # Lemmatizing (see lemmatize) is skipped here: not enough RAM.
    cleantext = clean_sent(cleantext)
    print('done!')
    return cleantext

###       function for the cloud that I'm going to call with interact    ############################
def makingclouds(Category, frame, col, maximum, atitle):
    """Render a word cloud of column *col* for one brand category.

    Args:
        Category: Brand category passed on to ``full_cleaning``.
        frame: DataFrame passed on to ``full_cleaning``.
        col: Name of the text column passed on to ``full_cleaning``.
        maximum: Maximum number of words in the cloud (``max_words``).
        atitle: Title drawn above the cloud.

    Returns:
        None. The cloud is drawn on a new matplotlib figure.
    """
    cloudtext = full_cleaning(Category, frame, col)
    print('preparing cloud')
    print('Cloud for ', Category, 'n.n')
    # collocations=False is passed so the cloud is built without word pairs.
    wordcloud = WordCloud(max_font_size=40, max_words=maximum,
                          background_color="white",
                          collocations=False).generate(cloudtext)
    plt.style.use("dark_background")
    plt.figure()
    plt.title(atitle, fontsize=40)
    plt.imshow(wordcloud, interpolation="bilinear")
    plt.axis("off")
In [7]:
# Interactive word cloud of the captions: pick a brand category and a
# maximum word count.
caption_frame = df[['brandcategory', 'caption']]
interact(
    makingclouds,
    Category=list1,
    # NOTE(review): makingclouds declares no `df` parameter, so this keyword
    # looks unused -- confirm before removing it.
    df=fixed(df),
    col=fixed('caption'),
    maximum=[20, 50, 100, 150],
    atitle=fixed('Most used words in captions'),
    frame=fixed(caption_frame),
)
Out[7]:
<function __main__.makingclouds(Category, frame, col, maximum, atitle)>

To do: add word cloud, add Altair, try lemmatizing.

In [8]:
# Interactive word cloud of the hashtags: pick a brand category and a
# maximum word count.
hashtag_frame = df[['brandcategory', 'hashtags']]
interact(
    makingclouds,
    Category=list1,
    # NOTE(review): makingclouds declares no `df` parameter, so this keyword
    # looks unused -- confirm before removing it.
    df=fixed(df),
    col=fixed('hashtags'),
    maximum=[20, 50, 100, 150],
    atitle=fixed('Most used words in hashtags'),
    frame=fixed(hashtag_frame),
)
Out[8]:
<function __main__.makingclouds(Category, frame, col, maximum, atitle)>